# topic 23 Paired values # # Create population source("../gnrnd5.R") source("../gnrnd4.R") gnrnd5(57214499910, 31000011000075) # Generates L1 and L2 # These values represent paired measures on a single # population. Generally, this is a "before" and "after" # measure of some value, i.e., a measure before some # treatment and a measure after the treatment. # For example, perhaps we want to look at how eating # flaxseed affects a person's HDL measure. [HDL is # High Density Lipoprotien, the "good cholesterol] We could # take HDL measures on people, then have them eat # 3 tablespoons of flaxseed each day for a month, then # take a new measure of their HDL. Then we have a pair # of measures on those people, a "before" and an "after" # measure. # What we have done here is to create such a population # of measures. HDL_before <- L1 HDL_after <- L2 ######################################### ## It is somewhat silly to do the ### ## following if we have all the data,### ## but here we want to see how we ### ## can get a confidence interval for ### ## the change in HDL level and/or do ### ## a hypothesis test on H0: no change### ## versus H1: HDL went up. ### ######################################### # To generate a confidence interval for the change in # HDL levels we want to take a sample of the population. # generate the index values for a sample of size 31 gnrnd4( 885033001, 500000001) L1 pre_HDL <- HDL_before[ L1 ] post_HDL <- HDL_after[ L1 ] # look at the scores pre_HDL post_HDL # but these are paired values. We can find how much of # an increase each of these 31 participants showed HDL_increase <- post_HDL - pre_HDL HDL_increase # Now we want to get a 95% confidence interval for that # increase. But the increase is just a single value # for each participant. We are really back in the # case of getting a confidence interval for the # population mean based on a sample where we do not # know the population standard deviation. source( "../ci_unknown.R") ci_unknown( sd( HDL_increase), length(HDL_increase), mean(HDL_increase), 0.95 ) ################################### ## Now, since we actually have the measures for the ## entire population, let us get the true mean of ## the increase. ################################### pop_increase <- HDL_after - HDL_before mu <- mean( pop_increase ) mu # our sample generated a 95% confidence interval that # did not contain the true mean! What went wrong? # Nothing, it just turned out that this was one # of the 5% generated confidence intervals that do # not contain the true mean. # To show this, let us generate 10000 such # samples and get from them the 10000 confidence # intervals. Then, see how many of those # contain the true mean increase L1 <- 1:10000 for ( i in 1:10000 ) { this_samp <- sample( pop_increase, 31 ) this_interval <- ci_unknown( sd( this_samp), 31, mean(this_samp), 0.95) if( as.numeric(this_interval[1]) < mu & mu < as.numeric( this_interval[2]) ) { L1[i] <- "contains" } else { L1[i] <- "misses" } } table( L1 ) # should give around 5% misses ######################################### ## Hypotheses test ## ######################################### # In this class the only null hypothesis that # we will use for this type of problem is that # the mean difference in the values is 0. But, # for us, in this situation, that is H0: mu=0. # The alternative could be any of the usual # three suspects: >, < != # We will use H1: mu>0 # But, again, this is just a hypothesis test for the # mean based on a sample where we do not know the # population standard deviations. That was the case # when we would use hypoth_test_unknown(). # We will do this at the 0.05 level of significance. # Recall that our sample is still in HDL_increase HDL_increase source("../hypo_unknown.R") hypoth_test_unknown( 0, 1, 0.05, 31, mean( HDL_increase), sd(HDL_increase)) # this resounding rejection is not a shock since it # was this sample that gave us the confidence interval # that did not contain the true mean. # We could run this same test with 10000 samples to see # how often we would reject H0 based on those samples L1 <- 1:10000 for( i in 1:10000 ) { this_samp <- sample( pop_increase, 31 ) this_test <- hypoth_test_unknown( 0, 1, 0.05, 31, mean(this_samp), sd( this_samp)) L1[i] <- this_test[14] } table( L1 ) # And, again, we should not expect to have a 85% rejection # because the true mean difference is 2.0422, not 0. # Let us start with a new population, but this one where # there is little change, still a change, but not much of one gnrnd5(77344499910, 3000011000075) # Generates L1 and L2 HDL_before <- L1 HDL_after <- L2 pop_increase <- HDL_after - HDL_before mean( pop_increase ) # now try the demo of 10000 samples L1 <- 1:10000 for( i in 1:10000 ) { this_samp <- sample( pop_increase, 31 ) this_test <- hypoth_test_unknown( 0, 1, 0.05, 31, mean(this_samp), sd( this_samp)) L1[i] <- this_test[14] } table( L1 )